A machine learning pipeline to analyze employee data and predict voluntary insurance enrollment.
We aim to predict whether an employee will opt in to a voluntary insurance plan using demographic and employment data. The dataset contains 10,000 synthetic records.
import pandas as pd
# Load dataset from the working directory into a DataFrame
df = pd.read_csv("employee_data.csv")
# Preview the first five rows to sanity-check columns and parsed values
df.head()
| | employee_id | age | gender | marital_status | salary | employment_type | region | has_dependents | tenure_years | enrolled |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10001 | 60 | Female | Single | 55122.97 | Part-time | West | No | 1.5 | 0 |
| 1 | 10002 | 50 | Female | Single | 89549.66 | Full-time | West | Yes | 12.8 | 1 |
| 2 | 10003 | 36 | Male | Divorced | 74145.66 | Part-time | Midwest | No | 3.8 | 0 |
| 3 | 10004 | 64 | Female | Married | 53877.83 | Full-time | Northeast | No | 3.3 | 0 |
| 4 | 10005 | 29 | Male | Single | 63404.63 | Contract | Midwest | Yes | 10.0 | 0 |
Analyzing patterns in enrollment by different groups.
df.describe(include="all")
| | employee_id | age | gender | marital_status | salary | employment_type | region | has_dependents | tenure_years | enrolled |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10000.00000 | 10000.0000 | 10000 | 10000 | 10000.000000 | 10000 | 10000 | 10000 | 10000.000000 | 10000.000000 |
| unique | NaN | NaN | 3 | 4 | NaN | 3 | 4 | 2 | NaN | NaN |
| top | NaN | NaN | Male | Married | NaN | Full-time | West | Yes | NaN | NaN |
| freq | NaN | NaN | 4815 | 4589 | NaN | 7041 | 2582 | 5993 | NaN | NaN |
| mean | 15000.50000 | 43.0020 | NaN | NaN | 65032.967907 | NaN | NaN | NaN | 3.967720 | 0.617400 |
| std | 2886.89568 | 12.2858 | NaN | NaN | 14923.958446 | NaN | NaN | NaN | 3.895488 | 0.486046 |
| min | 10001.00000 | 22.0000 | NaN | NaN | 2207.790000 | NaN | NaN | NaN | 0.000000 | 0.000000 |
| 25% | 12500.75000 | 33.0000 | NaN | NaN | 54714.342500 | NaN | NaN | NaN | 1.200000 | 0.000000 |
| 50% | 15000.50000 | 43.0000 | NaN | NaN | 65056.050000 | NaN | NaN | NaN | 2.800000 | 1.000000 |
| 75% | 17500.25000 | 54.0000 | NaN | NaN | 75053.687500 | NaN | NaN | NaN | 5.600000 | 1.000000 |
| max | 20000.00000 | 64.0000 | NaN | NaN | 120312.000000 | NaN | NaN | NaN | 36.000000 | 1.000000 |
# Count missing (NaN) values per column — all zeros below, so no imputation needed
df.isna().sum()
employee_id 0 age 0 gender 0 marital_status 0 salary 0 employment_type 0 region 0 has_dependents 0 tenure_years 0 enrolled 0 dtype: int64
import plotly.express as px

def _enrollment_rate_bar(column, title):
    """Bar chart of mean enrollment rate for each category of *column*.

    Because `enrolled` is 0/1, the group-wise mean is the enrollment rate.
    """
    rates = df.groupby(column)['enrolled'].mean().reset_index()
    return px.bar(
        rates,
        x=column, y='enrolled',
        title=title,
        labels={'enrolled': 'Enrollment Rate'}
    )

# One chart per categorical feature (same titles/labels as before, built via the helper)
fig_gender = _enrollment_rate_bar('gender', 'Enrollment Rate by Gender')
fig_marital = _enrollment_rate_bar('marital_status', 'Enrollment Rate by Marital Status')
fig_employment = _enrollment_rate_bar('employment_type', 'Enrollment Rate by Employment Type')
fig_region = _enrollment_rate_bar('region', 'Enrollment Rate by Region')
fig_dependents = _enrollment_rate_bar('has_dependents', 'Enrollment Rate by Dependents')

# Show the figures
fig_gender.show()
fig_marital.show()
fig_employment.show()
fig_region.show()
fig_dependents.show()
df.corr(numeric_only=True)
| | employee_id | age | salary | tenure_years | enrolled |
|---|---|---|---|---|---|
| employee_id | 1.000000 | -0.006100 | 0.005034 | 0.001356 | 0.015004 |
| age | -0.006100 | 1.000000 | 0.003872 | 0.002220 | 0.268749 |
| salary | 0.005034 | 0.003872 | 1.000000 | -0.003574 | 0.366403 |
| tenure_years | 0.001356 | 0.002220 | -0.003574 | 1.000000 | -0.007480 |
| enrolled | 0.015004 | 0.268749 | 0.366403 | -0.007480 | 1.000000 |
# Box plots: numeric feature distributions split by enrollment status
fig_age = px.box(df, x='enrolled', y='age', points='all', title="Age Distribution by Enrollment")
fig_salary = px.box(df, x='enrolled', y='salary', points='all', title="Salary Distribution by Enrollment")
fig_tenure = px.box(df, x='enrolled', y='tenure_years', points='all', title="Tenure Distribution by Enrollment")
# Histograms to see distributions
fig_age_hist = px.histogram(df, x='age', color='enrolled', barmode='overlay', nbins=30, title="Age Histogram by Enrollment")
fig_salary_hist = px.histogram(df, x='salary', color='enrolled', barmode='overlay', nbins=30, title="Salary Histogram by Enrollment")
fig_tenure_hist = px.histogram(df, x='tenure_years', color='enrolled', barmode='overlay', nbins=30, title="Tenure Histogram by Enrollment")
# Binned average enrollment line plots
df['age_bin'] = pd.cut(df['age'], bins=10)
df['salary_bin'] = pd.cut(df['salary'], bins=10)
df['tenure_bin'] = pd.cut(df['tenure_years'], bins=10)
# pd.cut returns a Categorical; observed=False keeps empty bins and silences
# the pandas FutureWarning about the changing default (same result as before)
age_trend = df.groupby('age_bin', observed=False)['enrolled'].mean().reset_index()
salary_trend = df.groupby('salary_bin', observed=False)['enrolled'].mean().reset_index()
tenure_trend = df.groupby('tenure_bin', observed=False)['enrolled'].mean().reset_index()
# Interval objects are not plottable axis values — use their string labels
age_trend['age_bin'] = age_trend['age_bin'].astype(str)
salary_trend['salary_bin'] = salary_trend['salary_bin'].astype(str)
tenure_trend['tenure_bin'] = tenure_trend['tenure_bin'].astype(str)
# Bug fix: the trend tables were computed but never plotted — build the line plots
fig_age_trend = px.line(age_trend, x='age_bin', y='enrolled', title="Enrollment Rate by Age Bin", labels={'enrolled': 'Enrollment Rate'})
fig_salary_trend = px.line(salary_trend, x='salary_bin', y='enrolled', title="Enrollment Rate by Salary Bin", labels={'enrolled': 'Enrollment Rate'})
fig_tenure_trend = px.line(tenure_trend, x='tenure_bin', y='enrolled', title="Enrollment Rate by Tenure Bin", labels={'enrolled': 'Enrollment Rate'})
# Show all plots
fig_age.show()
fig_salary.show()
fig_tenure.show()
fig_age_hist.show()
fig_salary_hist.show()
fig_tenure_hist.show()
fig_age_trend.show()
fig_salary_trend.show()
fig_tenure_trend.show()
Using Logistic Regression and Random Forest to predict enrollment.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Features: drop the ID (no predictive meaning) and the target itself.
X = df.drop(columns=["employee_id", "enrolled"])
y = df["enrolled"]
# Stratify so the positive rate (~62%) is preserved in train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

categorical = ["gender", "marital_status", "employment_type", "region", "has_dependents"]
numeric = ["age", "salary", "tenure_years"]

# Standardize numerics; one-hot encode categoricals, dropping the first level
# of each to avoid perfect collinearity in the linear model.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(drop='first'), categorical)
])

def _fit_and_report(model, label):
    """Fit *model* behind the shared preprocessor and print its test-set report.

    Returns (fitted_pipeline, test_predictions, test_positive_probabilities).
    """
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    prob = pipeline.predict_proba(X_test)[:, 1]
    print(f"{label} Report:")
    print(classification_report(y_test, pred))
    print("ROC AUC:", roc_auc_score(y_test, prob))
    return pipeline, pred, prob

# Same two models as before; the duplicated fit/evaluate code is now shared.
logreg_pipeline, logreg_pred, logreg_prob = _fit_and_report(LogisticRegression(max_iter=1000), "Logistic Regression")
rf_pipeline, rf_pred, rf_prob = _fit_and_report(RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest")
Logistic Regression Report:
precision recall f1-score support
0 0.87 0.85 0.86 765
1 0.91 0.92 0.92 1235
accuracy 0.90 2000
macro avg 0.89 0.89 0.89 2000
weighted avg 0.90 0.90 0.90 2000
ROC AUC: 0.9704935037442778
Random Forest Report:
precision recall f1-score support
0 1.00 1.00 1.00 765
1 1.00 1.00 1.00 1235
accuracy 1.00 2000
macro avg 1.00 1.00 1.00 2000
weighted avg 1.00 1.00 1.00 2000
ROC AUC: 1.0
Understanding what drives enrollment decisions using Random Forest.
import numpy as np
# Rebuild the post-transform feature names: numeric columns first (the
# ColumnTransformer's order), then the one-hot encoded categorical columns.
encoder = rf_pipeline.named_steps["preprocessor"].transformers_[1][1]
feature_names = numeric + list(encoder.get_feature_names_out(categorical))
importances = rf_pipeline.named_steps["model"].feature_importances_
# Pair names with importances and print the ten largest, descending.
ranked = sorted(zip(feature_names, importances), key=lambda pair: pair[1], reverse=True)
for name, score in ranked[:10]:
    print(f"{name}: {score:.4f}")
salary: 0.2774 has_dependents_Yes: 0.2558 age: 0.1946 employment_type_Full-time: 0.1747 employment_type_Part-time: 0.0558 tenure_years: 0.0253 gender_Male: 0.0030 region_Northeast: 0.0022 marital_status_Married: 0.0022 region_West: 0.0022
import plotly.graph_objects as go
import numpy as np
import pandas as pd
# Extract feature names after preprocessing (numeric order, then one-hot columns)
ohe = logreg_pipeline.named_steps["preprocessor"].transformers_[1][1]
ohe_features = ohe.get_feature_names_out(categorical)
feature_names = numeric + list(ohe_features)
# Coefficients are on standardized / one-hot inputs, so magnitudes are comparable
coefficients = logreg_pipeline.named_steps["model"].coef_[0]
# Tabulate and sort so the horizontal bars appear in coefficient order
coef_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
}).sort_values("Coefficient")
# Color by sign: red = decreases log-odds of enrolling, green = increases
colors = ["red" if val < 0 else "green" for val in coef_df["Coefficient"]]
# Plot using Plotly
fig = go.Figure(go.Bar(
    x=coef_df["Coefficient"],
    y=coef_df["Feature"],
    orientation='h',
    marker=dict(color=colors)
))
fig.update_layout(
    title="Logistic Regression Feature Importance (Coefficients)",
    xaxis_title="Coefficient Value",
    yaxis_title="Feature",
    showlegend=False
)
# Bug fix: the figure was built but never displayed — show it like every other plot
fig.show()
Improve model performance via grid search.